In the first analysis we utilize the raw counts. As such this network will be built using relative abundance data.
# Set seed
set.seed(777)
# Import data
otus <- as.matrix(read.csv2("16S_data/OTU_table_raw.csv", fill = TRUE, header = TRUE,
row.names = 1))
taxa <- tax_table(as.matrix(read.csv2("16S_data/tax_table_raw.csv", fill = TRUE, header = TRUE,
row.names = 1))
)
phy_df <- phyloseq(otu_table(otus, taxa_are_rows = FALSE), taxa)
# filterobj <- filterTaxonMatrix(otus, minocc = 20,
# keepSum = TRUE, return.filtered.indices = TRUE)
# otus.f <- filterobj$mat
# taxa.f <- taxa[setdiff(1:nrow(taxa), filterobj$filtered.indices),]
# dummyTaxonomy <- colnames(tax_df); dummyTaxonomy[1] <- "Kingdom_dummy"
# taxa.f <- rbind(taxa.f, dummyTaxonomy)
# rownames(taxa.f)[nrow(taxa.f)] <- "0"
# rownames(otus.f)[nrow(otus.f)] <- "0"
#
# # Next, we assemble a new phyloseq object with the filtered OTU and taxonomy tables.
# updatedotus <- otu_table(otus.f, taxa_are_rows = TRUE)
# updatedtaxa <- tax_table(taxa.f)
# phyloseqobj.f <- phyloseq(updatedotus, updatedtaxa)
# Prevalence filtering
phy_df_filtered <- filter_taxa(phy_df, function(x) sum(x > 30) > (0.25*length(x)), TRUE)
sp_easi <- spiec.easi(phy_df_filtered, method='mb', lambda.min.ratio=1e-2,
nlambda=20, icov.select.params=list(rep.num=50))## Normalizing/clr transformation of data with pseudocount ...
## Inverse Covariance Estimation with mb ...
## Model selection with stars ...
## Done!
ig.mb <- adj2igraph(sp_easi$refit, vertex.attr = list(name=taxa_names(phy_df_filtered)))
vsize <- Biobase::rowMedians(clr(otu_table(phy_df_filtered), 1))+15
Lineage_rel <- tax_table(phy_df_filtered)[,"Lineage"]
Lineage_rel <- factor(Lineage_rel, levels = unique(Lineage_rel))
vweights <- summary(symBeta(getOptBeta(sp_easi), mode='maxabs'))
MAGs <- c(); MAGs[taxa_names(phy_df_filtered)=="Otu00001"] <- "Ramlibacter MAG"
MAGs[taxa_names(phy_df_filtered)=="Otu00002"] <- "Bacteroidetes sp. MAG1"
MAGs[taxa_names(phy_df_filtered)=="Otu00003"] <- "Bacteroidetes sp. MAG2"
MAGs[is.na(MAGs)] <- ""
# png(file = "./Figures/Figures_network/NETWORK-REL-CX-C30-A25.png", width = 9, height = 9, res = 500, units = "in")
plot_network_custom(ig.mb, phy_df_filtered, type='taxa',
line_weight = 2, hjust = 0.5,
point_size = 0.1, alpha = 0.01, label_size = 3.95)+
# scale_fill_brewer(palette = "Paired")+
# scale_color_brewer(palette = "Paired")+
scale_fill_manual(values = c("#e2a2fd", brewer.pal(n = 12, "Paired")[c(3:8,1:2,11,12)]) )+
geom_point(aes(size = vsize, fill = Lineage_rel), alpha = 0.5,
colour="black", shape=21)+
guides(size = FALSE,
fill = guide_legend(title = "Lineage", override.aes = list(size = 5),
nrow = 4),
color = FALSE)+
theme(legend.position="bottom", legend.text=element_text(size=12),
text = element_text(size = 12),
plot.margin = unit(c(1,1,1,1), "cm"))+
scale_size(range = c(5, 15))+
geom_label_repel(aes(label = MAGs), fontface = 'bold', color = 'black',
box.padding = 0.35, point.padding = 0.5,
segment.color = 'black',
size = 4,
# Width of the line segments.
segment.size = 1.5,
# Draw an arrow from the label to the data point.
arrow = arrow(length = unit(0.015, 'npc')),
nudge_x = -0.1,
nudge_y = 0.6
)# dev.off()The second network will be built using absolute abundance data by multiplying the relative taxon abundances by the total cell density. The final obtained counts will expressed as nr. of cells measured in 50 µL samples. We also only consider the OTUs that were left after the prevalence filtering conducted in the network construction with relative abundance data.
# Import cell count data
cell_counts <- read.csv("16S_data/cell_counts.csv")
cell_counts$sample_title <- as.factor(cell_counts$sample_title)
# Import metadata
meta_16S <- read.csv("16s_data/Metadata.csv")[1:77,]; rownames(meta_16S) <- meta_16S$sample_title
# Calculate proportions
phy_df_rel <- transform_sample_counts(phy_df, function(x) x/sum(x))
# Add metadata
sample_data(phy_df_rel) <- sample_data(meta_16S)
# Select samples for which corresponding counts are available
cell_counts <- cell_counts[cell_counts$sample_title %in% sample_names(phy_df_rel), ]
cell_counts <- droplevels(cell_counts)
phy_df_rel <- prune_samples(sample_names(phy_df_rel) %in% cell_counts$sample_title, phy_df_rel)
# Multiply with cell counts in 50 µL of sample
otu_table(phy_df_rel) <- otu_table(phy_df_rel) * cell_counts$Number_of_cells
# Select taxa that were selected based on prevalence in previous chunk
phy_df_abs <- prune_taxa(taxa_names(phy_df_filtered), phy_df_rel)
# Round absolute abundances to integers
otu_table(phy_df_abs) <- round(otu_table(phy_df_abs), 0)
# Construct network
sp_easi_abs <- spiec.easi(phy_df_abs, method='mb', lambda.min.ratio=1e-2,
nlambda=20, icov.select.params=list(rep.num=50))## Normalizing/clr transformation of data with pseudocount ...
## Inverse Covariance Estimation with mb ...
## Model selection with stars ...
## Done!
ig.mb_abs <- adj2igraph(sp_easi_abs$refit, vertex.attr = list(name=taxa_names(phy_df_abs)))
vsize_abs <- Biobase::rowMedians(clr(otu_table(phy_df_abs), 1))+15
Lineage_abs <- tax_table(phy_df_abs)[,"Lineage"]
Lineage_abs <- factor(Lineage_abs, levels = unique(Lineage_abs))
vweights_abs <- summary(symBeta(getOptBeta(sp_easi_abs), mode='maxabs'))
MAGs <- c(); MAGs[taxa_names(phy_df_abs)=="Otu00001"] <- "Ramlibacter sp. MAG"
MAGs[taxa_names(phy_df_abs)=="Otu00002"] <- "Bacteroidetes sp. MAG1"
MAGs[taxa_names(phy_df_abs)=="Otu00003"] <- "Bacteroidetes sp. MAG2"
MAGs[is.na(MAGs)] <-""
# Plot network inferred from absolute abundances
# png(file = "./Figures/Figures_network/NETWORK-ABS-CX-C30-A25.png", width = 9, height = 9, res = 500, units = "in")
plot_network_custom(ig.mb_abs, phy_df_abs, type='taxa',
line_weight = 2, hjust = 0.5,
point_size = 0.1, alpha = 0.01, label_size = 3.95)+
# scale_fill_brewer(palette = "Paired")+
# scale_color_brewer(palette = "Paired")+
scale_fill_manual(values = c("#e2a2fd", brewer.pal(n = 12, "Paired")[c(3:8,1:2,11,12)]) )+
geom_point(aes(size = vsize_abs, fill = Lineage_abs), alpha = 0.5,
colour="black", shape=21)+
guides(size = FALSE,
fill = guide_legend(title = "Lineage", override.aes = list(size = 5),
nrow = 4),
color = FALSE)+
theme(legend.position="bottom", legend.text=element_text(size=12),
text = element_text(size = 12),
plot.margin = unit(c(1,1,1,1), "cm"))+
scale_size(range = c(5, 15))+
geom_label_repel(aes(label = MAGs), fontface = 'bold', color = 'black',
box.padding = 0.35, point.padding = 0.5,
segment.color = 'black',
size = 4,
# Width of the line segments.
segment.size = 1.5,
# Draw an arrow from the label to the data point.
arrow = arrow(length = unit(0.015, 'npc')),
nudge_x = -0.1,
nudge_y = 0.6
)# dev.off()# Plot absolute OTU dynamics of OTU1
df_abs <- psmelt(phy_df_abs)
p_abs_otu1 <- df_abs %>% dplyr::filter(OTU == "Otu00001") %>%
ggplot(aes(x = Timepoint, y = Abundance))+
facet_grid(.~Reactor.cycle)+
scale_shape_manual(values = c(21,24))+
geom_line(size = 1.5, linetype = 2, color = adjustcolor("#000000", 0.5))+
geom_point(size = 4, fill = "#e2a2fd", aes(shape = Reactor_status), alpha = 0.8,
color = "black")+
theme_bw()+
ylab(expression("Otu00001 abundance - cells mL"^"-1"))+
xlab("Time relative to reactor start - days")+
scale_y_continuous(breaks = seq(0,50e3,5e3), limits = c(0,30e3))+
theme(axis.text=element_text(size=14), axis.title=element_text(size=20),
title=element_text(size=20), legend.text=element_text(size=14),
legend.background = element_rect(fill="transparent"),
strip.text.x=element_text(size=18),
legend.position = "top")+
guides(shape = guide_legend(title="Reactor status", ncol =1))
print(p_abs_otu1)# Read data
mean_coverage <- read.table("./SAMPLES-SUMMARY/bins_across_samples/mean_coverage.txt", header = TRUE)
std_coverage <- read.table("./SAMPLES-SUMMARY/bins_across_samples/std_coverage.txt", header = TRUE)
bin_size <- read.table("./SAMPLES-SUMMARY/general_bins_summary.txt", header = TRUE)[, c(1, 3, 6, 9)]
total_reads <- read.table("./sample_reads.tsv", header = TRUE)
read_length <- 300
# From wide to long format
mean_coverage_long <- gather(mean_coverage, Sample_ID, coverage,
SAMPLE_16:SAMPLE_65, factor_key=TRUE)
std_coverage_long <- gather(std_coverage, Sample_ID, std_coverage,
SAMPLE_16:SAMPLE_65,
factor_key=TRUE)
coverage_data <- data.frame(mean_coverage_long,
std_coverage = std_coverage_long[,3])
# Read and add metadata
# meta <- read.csv2("metadata.csv")
# meta$Sample_ID <- gsub(meta$Sample_ID, pattern = ".", replacement = "_", fixed = TRUE)
data_total <- left_join(coverage_data, total_reads, by = "Sample_ID")
data_total <- left_join(data_total, bin_size, by = "bins")
# data_total <- left_join(data_total, meta, by = "Sample_ID")
data_total$bins <- plyr::revalue(data_total$bins, c("BetIa_bin"="Limnohabitans_bin", "bacIa_vizbin1"="Bacteroidetes_bin1",
"bacIa_vizbin2"="Bacteroidetes_bin2"))
# Calculate relative abundance of the bins
data_total$mean_rel_abundance <- 100*(data_total$coverage*data_total$bin_size)/(read_length*data_total$Total_reads)
data_total$upper_rel_abundance <- 100*((data_total$coverage+data_total$std_coverage)*data_total$bin_size)/(read_length*data_total$Total_reads)
data_total$lower_rel_abundance <- 100*((data_total$coverage-data_total$std_coverage)*data_total$bin_size)/(read_length*data_total$Total_reads)
data_total$mean_rel_abundance_map <- 100*(data_total$coverage*data_total$bin_size)/(read_length*data_total$Mapped_reads)
data_total$upper_rel_abundance_map <- 100*((data_total$coverage+data_total$std_coverage)*data_total$bin_size)/(read_length*data_total$Mapped_reads)
data_total$lower_rel_abundance_map <- 100*((data_total$coverage-data_total$std_coverage)*data_total$bin_size)/(read_length*data_total$Mapped_reads)
# Add additional column that assigns PNCs to correct MAG
data_total$Genome_id <- factor(rep(c("Limnohabitans", "Limnohabitans", "Limnohabitans", "Bacteroidetes MAG2", "Bacteroidetes MAG1", "Bacteroidetes MAG2"), 4))
# Plot genome size for all three genomes
data_total[data_total$bins %in% c("Bacteroidetes_bin1",
"Bacteroidetes_bin2","Limnohabitans_bin"), ] %>%
ggplot(aes(x = bins, y = bin_size/(4*1e6), fill = Genome_id))+
theme_bw()+
geom_bar(width = 0.5, stat="identity", alpha = 0.7)+
coord_flip()+
scale_fill_manual(values = c(col_bac1, col_bac2, col_limno))+
theme(axis.text.x = element_text(angle = 0, size = 16),
axis.text.y = element_text(size = 16),
axis.title.x = element_text(size = 18),
legend.title = element_text("Genome bin"), legend.position = "top")+
xlab("")+
ylab("Genome size (Mbp)")+
guides(fill = FALSE)+
geom_hline(yintercept = 1, size = 2, linetype="dotted")# Plot irep for all 3 genomes
data_total[data_total$bins %in% c("Bacteroidetes_bin1",
"Bacteroidetes_bin2","Limnohabitans_bin"), ] %>%
ggplot(aes(x = bins, y = mean_irep/4, fill = Genome_id))+
theme_bw()+
geom_bar(width = 0.5, stat="identity", alpha = 0.7)+
coord_flip()+
scale_fill_manual(values = c(col_bac1, col_bac2, col_limno))+
theme(axis.text.x = element_text(angle = 0, size = 16),
axis.text.y = element_text(size = 16),
axis.title.x = element_text(size = 18),
legend.title = element_text(""), legend.position = "top")+
xlab("")+
ylab("Index of Replication (iRep)")+
guides(fill = FALSE)+
geom_hline(yintercept = 1.34, size = 2, linetype="dotted")It is clear that there is significant %GC coverage bias present. The estimated relative abundances from metagenomics do not quantitatively match with the V3-V4 16S rRNA gene amplicon data. \[Relative\ abundance =100*(\frac{mean\ coverage * bin\ size}{read\ length*total\ sample\ reads })\] Another option is to calculate relative to mapped number of reads: \[Relative\ abundance =100*(\frac{mean\ coverage * bin\ size}{read\ length*total\ sample\ reads * \%mapped\ reads})\]
Import reference relative abundances from 16S data set in order to directly compare with metagenomic data set.
df_16S <- read.table("relative_abundance_16S.tsv", header = TRUE)
df_16S_long <- gather(df_16S, Sample_ID, relative_abundance_16S,
SAMPLE_16:SAMPLE_65, factor_key=TRUE)# Subset for only the three complete genomes (not PNCs).
data_total_sb <- data_total[data_total$bins %in% c("Limnohabitans_bin", "Bacteroidetes_bin1", "Bacteroidetes_bin2"),]
p_meta <- ggplot(data = data_total_sb, aes(x = bins, y = mean_rel_abundance, fill = bins))+
geom_point(size = 4, shape = 21, alpha = 0.7)+
scale_fill_manual(values = c(col_bac1, col_bac2, col_limno))+
theme_bw()+
geom_errorbar(aes(ymin=lower_rel_abundance,
ymax=upper_rel_abundance,
width=.1))+
facet_grid(.~Sample_ID)+
# ylim(0,1)+
theme(axis.text=element_text(size=14), axis.title=element_text(size=20),
title=element_text(size=20), legend.text=element_text(size=14),
legend.background = element_rect(fill="transparent"),
axis.text.x = element_text(angle = 45, hjust = 1),
strip.text.x=element_text(size=18))+
ylab("Mean relative abundance (%)")+
ylim(-1,100)+
ggtitle("Metagenomic - total reads")
# Corrected for mapped N° of reads
p_meta_mapped <- ggplot(data = data_total_sb, aes(x = bins, y = mean_rel_abundance_map, fill = bins))+
geom_point(size = 4, shape = 21, alpha = 0.7)+
scale_fill_manual(values = c(col_bac1, col_bac2, col_limno))+
theme_bw()+
geom_errorbar(aes(ymin=lower_rel_abundance_map,
ymax=upper_rel_abundance_map,
width=.1))+
facet_grid(.~Sample_ID)+
# ylim(0,1)+
theme(axis.text=element_text(size=14), axis.title=element_text(size=20),
title=element_text(size=20), legend.text=element_text(size=14),
legend.background = element_rect(fill="transparent"),
axis.text.x = element_text(angle = 45, hjust = 1),
strip.text.x=element_text(size=18))+
ylab("Mean relative abundance (%)")+
ylim(-1,100)+
ggtitle("Metagenomic - mapped reads")
p_16S <- ggplot(data = df_16S_long, aes(x = bins, y = relative_abundance_16S, fill = bins))+
geom_point(size = 4, shape = 21, alpha = 0.7)+
scale_fill_manual(values = c(col_bac1, col_bac2, col_limno))+
theme_bw()+
facet_grid(.~Sample_ID)+
theme(axis.text=element_text(size=14), axis.title=element_text(size=20),
title=element_text(size=20), legend.text=element_text(size=14),
legend.background = element_rect(fill="transparent"),
axis.text.x = element_text(angle = 45, hjust = 1),
strip.text.x=element_text(size=18))+
ylab("Mean relative abundance (%)")+
ylim(-1,100)+
ggtitle("V3-V4 16S")
grid_arrange_shared_legend(p_meta, p_meta_mapped, p_16S, ncol = 1, nrow = 3)# First we need the files that map the gene ID to the sequence ID (linux cmd: https://github.com/rprops/MetaG_lakeMI/wiki/11.-Genome-annotation)
# These are stored in the IMG_annotation data for each genome bin
# Next, extract the %GC of each gene from the gff file
extract_gc_from_gff("./IMG_annotation/IMG_2724679690_Limnohabitans/121950.assembled.gff", outputFolder = "GC_analysis")
extract_gc_from_gff("./IMG_annotation/IMG_2724679691_Bacteroidetes_bin1/121951.assembled.gff", outputFolder = "GC_analysis")
extract_gc_from_gff("./IMG_annotation/IMG_2724679698_Bacteroidetes_bin2/121960.assembled.gff", outputFolder = "GC_analysis")
# Use these files to make dataframes mapping function (COGs/Pfams/KO) and %GC
LIMNO_gc_cog <- gc2function(seq_id_gc = "GC_analysis/seqid_GC_121950.assembled.gff.tsv", gene_id_seq_id ="./IMG_annotation/IMG_2724679690_Limnohabitans/Annotation/2724679690_gene_oid_2_seq_id.txt",
functions = "./IMG_annotation/IMG_2724679690_Limnohabitans/Annotation/2724679690.cog.tab.txt", gc_thresh = 0.1, output = FALSE)## Wed Nov 15 12:08:29 2017 --- There are 2830 genes with > 0.1 %
## Wed Nov 15 12:08:29 2017 --- This is 100 % of all genes
## Wed Nov 15 12:08:29 2017 --- The 10 genes with the highest GC% are:
## function_id function_name
## 2821 COG0405 Gamma-glutamyltranspeptidase
## 2822 COG2755 Lysophospholipase L1 or related esterase
## 2823 COG0642 Signal transduction histidine kinase
## 2824 COG1514 2'-5' RNA ligase
## 2825 COG1767 Triphosphoribosyl-dephospho-CoA synthetase
## 2826 COG1261 Flagella basal body P-ring formation protein FlgA
## 2827 COG0665 Glycine/D-amino acid oxidase (deaminating)
## 2828 COG0810 Periplasmic protein TonB, links inner and outer membranes
## 2829 COG1040 Predicted amidophosphoribosyltransferases
## 2830 COG1240 Mg-chelatase subunit ChlD
## GC
## 2821 78.5
## 2822 78.5
## 2823 78.5
## 2824 78.5
## 2825 78.6
## 2826 78.6
## 2827 79.1
## 2828 79.5
## 2829 79.7
## 2830 80.6
BAC1_gc_cog <- gc2function(seq_id_gc = "GC_analysis/seqid_GC_121951.assembled.gff.tsv", gene_id_seq_id ="./IMG_annotation/IMG_2724679691_Bacteroidetes_bin1/Annotation/2724679691_gene_oid_2_seq_id.txt",
functions = "./IMG_annotation/IMG_2724679691_Bacteroidetes_bin1//Annotation/2724679691.cog.tab.txt", gc_thresh = 0.1, output = FALSE)## Wed Nov 15 12:08:29 2017 --- There are 1889 genes with > 0.1 %
## Wed Nov 15 12:08:29 2017 --- This is 100 % of all genes
## Wed Nov 15 12:08:29 2017 --- The 10 genes with the highest GC% are:
## function_id
## 1880 COG0052
## 1881 COG0183
## 1882 COG0629
## 1883 COG0509
## 1884 COG1734
## 1885 COG0636
## 1886 COG3502
## 1887 COG1765
## 1888 COG0377
## 1889 COG4104
## function_name
## 1880 Ribosomal protein S2
## 1881 Acetyl-CoA acetyltransferase
## 1882 Single-stranded DNA-binding protein
## 1883 Glycine cleavage system H protein (lipoate-binding)
## 1884 RNA polymerase-binding transcription factor DksA
## 1885 FoF1-type ATP synthase, membrane subunit c/Archaeal/vacuolar-type H+-ATPase, subunit K
## 1886 Uncharacterized conserved protein, DUF952 family
## 1887 Uncharacterized OsmC-related protein
## 1888 NADH:ubiquinone oxidoreductase 20 kD subunit (chhain B) or related Fe-S oxidoreductase
## 1889 Zn-binding Pro-Ala-Ala-Arg (PAAR) domain, incolved in TypeVI secretion
## GC
## 1880 51.6
## 1881 51.6
## 1882 52.0
## 1883 52.6
## 1884 52.7
## 1885 52.8
## 1886 52.9
## 1887 53.6
## 1888 53.6
## 1889 59.3
BAC2_gc_cog <- gc2function(seq_id_gc = "GC_analysis/seqid_GC_121960.assembled.gff.tsv", gene_id_seq_id ="./IMG_annotation/IMG_2724679698_Bacteroidetes_bin2/Annotation/2724679698_gene_oid_2_seq_id.txt",
functions = "./IMG_annotation/IMG_2724679698_Bacteroidetes_bin2/Annotation/2724679698.cog.tab.txt", gc_thresh = 0.1, output = FALSE)## Wed Nov 15 12:08:29 2017 --- There are 1797 genes with > 0.1 %
## Wed Nov 15 12:08:29 2017 --- This is 100 % of all genes
## Wed Nov 15 12:08:29 2017 --- The 10 genes with the highest GC% are:
## function_id
## 1788 COG4675
## 1789 COG0636
## 1790 COG1501
## 1791 COG0725
## 1792 COG5434
## 1793 COG2115
## 1794 COG4225
## 1795 COG3669
## 1796 COG4588
## 1797 COG3258
## function_name
## 1788 Microcystin-dependent protein (function unknown)
## 1789 FoF1-type ATP synthase, membrane subunit c/Archaeal/vacuolar-type H+-ATPase, subunit K
## 1790 Alpha-glucosidase, glycosyl hydrolase family GH31
## 1791 ABC-type molybdate transport system, periplasmic component
## 1792 Polygalacturonase
## 1793 Xylose isomerase
## 1794 Rhamnogalacturonyl hydrolase YesR
## 1795 Alpha-L-fucosidase
## 1796 Accessory colonization factor AcfC, contains ABC-type periplasmic domain
## 1797 Cytochrome c
## GC
## 1788 44.5
## 1789 44.6
## 1790 44.9
## 1791 45.0
## 1792 45.1
## 1793 45.1
## 1794 45.4
## 1795 45.5
## 1796 46.6
## 1797 46.8
LIMNO_gc_pfam <- gc2function(seq_id_gc = "GC_analysis/seqid_GC_121950.assembled.gff.tsv", gene_id_seq_id ="./IMG_annotation/IMG_2724679690_Limnohabitans/Annotation/2724679690_gene_oid_2_seq_id.txt",
functions = "./IMG_annotation/IMG_2724679690_Limnohabitans/Annotation/2724679690.pfam.tab.txt", gc_thresh = 0.1, output = FALSE)## Wed Nov 15 12:08:29 2017 --- There are 4954 genes with > 0.1 %
## Wed Nov 15 12:08:29 2017 --- This is 100 % of all genes
## Wed Nov 15 12:08:29 2017 --- The 10 genes with the highest GC% are:
## function_id function_name GC
## 4945 pfam13202 EF-hand_5 79.0
## 4946 pfam16537 T2SSB 79.0
## 4947 pfam01266 DAO 79.1
## 4948 pfam03544 TonB_C 79.5
## 4949 pfam00156 Pribosyltran 79.7
## 4950 pfam11142 DUF2917 79.8
## 4951 pfam13318 DUF4089 79.8
## 4952 pfam13519 VWA_2 80.6
## 4953 pfam02120 Flg_hook 80.9
## 4954 pfam03023 MVIN 81.7
BAC1_gc_pfam <- gc2function(seq_id_gc = "GC_analysis/seqid_GC_121951.assembled.gff.tsv", gene_id_seq_id ="./IMG_annotation/IMG_2724679691_Bacteroidetes_bin1/Annotation/2724679691_gene_oid_2_seq_id.txt",
functions = "./IMG_annotation/IMG_2724679691_Bacteroidetes_bin1//Annotation/2724679691.pfam.tab.txt", gc_thresh = 0.1, output = FALSE)## Wed Nov 15 12:08:29 2017 --- There are 3929 genes with > 0.1 %
## Wed Nov 15 12:08:29 2017 --- This is 100 % of all genes
## Wed Nov 15 12:08:29 2017 --- The 10 genes with the highest GC% are:
## function_id function_name GC
## 3920 pfam02803 Thiolase_C 51.6
## 3921 pfam00436 SSB 52.0
## 3922 pfam00171 Aldedh 52.2
## 3923 pfam01597 GCV_H 52.6
## 3924 pfam01258 zf-dskA_traR 52.7
## 3925 pfam00137 ATP-synt_C 52.8
## 3926 pfam06108 DUF952 52.9
## 3927 pfam02566 OsmC 53.6
## 3928 pfam01058 Oxidored_q6 53.6
## 3929 pfam05488 PAAR_motif 59.3
BAC2_gc_pfam <- gc2function(seq_id_gc = "GC_analysis/seqid_GC_121960.assembled.gff.tsv", gene_id_seq_id ="./IMG_annotation/IMG_2724679698_Bacteroidetes_bin2/Annotation/2724679698_gene_oid_2_seq_id.txt",
functions = "./IMG_annotation/IMG_2724679698_Bacteroidetes_bin2/Annotation/2724679698.pfam.tab.txt", gc_thresh = 0.1, output = FALSE)## Wed Nov 15 12:08:29 2017 --- There are 3573 genes with > 0.1 %
## Wed Nov 15 12:08:29 2017 --- This is 100 % of all genes
## Wed Nov 15 12:08:29 2017 --- The 10 genes with the highest GC% are:
## function_id function_name GC
## 3564 pfam13531 SBP_bac_11 46.6
## 3565 pfam13442 Cytochrome_CBB3 46.8
## 3566 pfam12779 YXWGXW 50.0
## 3567 pfam12779 YXWGXW 50.0
## 3568 pfam01391 Collagen 50.4
## 3569 pfam01391 Collagen 50.4
## 3570 pfam01391 Collagen 50.4
## 3571 pfam01391 Collagen 50.4
## 3572 pfam01391 Collagen 50.4
## 3573 pfam01391 Collagen 50.4
LIMNO_gc_KO <- gc2function(seq_id_gc = "GC_analysis/seqid_GC_121950.assembled.gff.tsv", gene_id_seq_id ="./IMG_annotation/IMG_2724679690_Limnohabitans/Annotation/2724679690_gene_oid_2_seq_id.txt",
functions = "./IMG_annotation/IMG_2724679690_Limnohabitans/Annotation/2724679690.ko.tab.txt", gc_thresh = 0.1, output = FALSE)## Wed Nov 15 12:08:30 2017 --- There are 2164 genes with > 0.1 %
## Wed Nov 15 12:08:30 2017 --- This is 100 % of all genes
## Wed Nov 15 12:08:30 2017 --- The 10 genes with the highest GC% are:
## function_id
## 2155 two-component system, OmpR family, sensor histidine kinase QseC [EC:2.7.13.3]
## 2156 2'-5' RNA ligase [EC:6.5.1.-]
## 2157 triphosphoribosyl-dephospho-CoA synthase [EC:2.4.2.52]
## 2158 flagella basal body P-ring formation protein FlgA
## 2159 general secretion pathway protein B
## 2160 tRNA 5-methylaminomethyl-2-thiouridine biosynthesis bifunctional protein [EC:2.1.1.61 1.5.-.-]
## 2161 tRNA 5-methylaminomethyl-2-thiouridine biosynthesis bifunctional protein [EC:2.1.1.61 1.5.-.-]
## 2162 4'-phosphopantetheinyl transferase [EC:2.7.8.-]
## 2163 magnesium chelatase subunit D [EC:6.6.1.1]
## 2164 flagellar hook-length control protein FliK
## function_name GC
## 2155 EC:2.7.13.3 78.5
## 2156 EC:6.5.1.- 78.5
## 2157 EC:2.4.2.52 78.6
## 2158 78.6
## 2159 79.0
## 2160 EC:1.5.- 79.1
## 2161 EC:2.1.1.61 79.1
## 2162 EC:2.7.8.- 79.1
## 2163 EC:6.6.1.1 80.6
## 2164 80.9
BAC1_gc_KO <- gc2function(seq_id_gc = "GC_analysis/seqid_GC_121951.assembled.gff.tsv", gene_id_seq_id ="./IMG_annotation/IMG_2724679691_Bacteroidetes_bin1/Annotation/2724679691_gene_oid_2_seq_id.txt",
functions = "./IMG_annotation/IMG_2724679691_Bacteroidetes_bin1//Annotation/2724679691.ko.tab.txt", gc_thresh = 0.1, output = FALSE)## Wed Nov 15 12:08:30 2017 --- There are 1384 genes with > 0.1 %
## Wed Nov 15 12:08:30 2017 --- This is 100 % of all genes
## Wed Nov 15 12:08:30 2017 --- The 10 genes with the highest GC% are:
## function_id function_name GC
## 1375 single-strand DNA-binding protein 51.3
## 1376 threonine aldolase [EC:4.1.2.5] EC:4.1.2.5 51.3
## 1377 large subunit ribosomal protein L32 51.3
## 1378 uncharacterized protein 51.5
## 1379 translation initiation factor IF-2 51.5
## 1380 small subunit ribosomal protein S2 51.6
## 1381 single-strand DNA-binding protein 52.0
## 1382 glycine cleavage system H protein 52.6
## 1383 F-type H+-transporting ATPase subunit c 52.8
## 1384 NADH-quinone oxidoreductase subunit B [EC:1.6.5.3] EC:1.6.5.3 53.6
BAC2_gc_KO <- gc2function(seq_id_gc = "GC_analysis/seqid_GC_121960.assembled.gff.tsv", gene_id_seq_id ="./IMG_annotation/IMG_2724679698_Bacteroidetes_bin2/Annotation/2724679698_gene_oid_2_seq_id.txt",
functions = "./IMG_annotation/IMG_2724679698_Bacteroidetes_bin2/Annotation/2724679698.ko.tab.txt", gc_thresh = 0.1, output = FALSE)## Wed Nov 15 12:08:30 2017 --- There are 1342 genes with > 0.1 %
## Wed Nov 15 12:08:30 2017 --- This is 100 % of all genes
## Wed Nov 15 12:08:30 2017 --- The 10 genes with the highest GC% are:
## function_id function_name GC
## 1333 uncharacterized protein 43.9
## 1334 NADH-quinone oxidoreductase subunit B [EC:1.6.5.3] EC:1.6.5.3 44.3
## 1335 F-type H+-transporting ATPase subunit c 44.6
## 1336 alpha-D-xyloside xylohydrolase [EC:3.2.1.177] EC:3.2.1.177 44.9
## 1337 xylose isomerase [EC:5.3.1.5] EC:5.3.1.5 45.1
## 1338 alpha-L-fucosidase [EC:3.2.1.51] EC:3.2.1.51 45.4
## 1339 uncharacterized protein 45.5
## 1340 uncharacterized protein 45.8
## 1341 accessory colonization factor AcfC 46.6
## 1342 thiosulfate dehydrogenase [EC:1.8.2.2] EC:1.8.2.2 46.8
Motivation: For COGs there exists a hierarchy allowing us to investigate whether there is a conservation of high/low %GC in certain functional gene groups. In order to do this we need to incorporate this hierarchy into the genome dataframes we have now.
Here we use the dataframe made in the previous section to see if there is a significant difference in the gene length of the COGs within these three consensus genomes.
Observation: They have very small genes: on average < 500bp.
# Limnohabitans MAG gene length distribution
p_LIMNO_length <- easyGgplot2::ggplot2.histogram(data = LIMNO_gc_cog, xName = 'gene_length',
groupName = 'Genome', alpha = 0.5,
legendPosition = "top", binwidth = 0.15,
groupColors = col_limno,addMeanLine=TRUE, meanLineColor="black",
meanLineType="dashed")+ theme_bw()+ ylim(0,15)+
labs(x = "Gene length (bp)", y = "Count")+ theme(legend.position="none")+
ggtitle("Limnohabitans MAG")+ xlim(0,2000)
# Bacteroidetes MAG1 gene length distribution
p_BAC1_length <- easyGgplot2::ggplot2.histogram(data = BAC1_gc_cog, xName = 'gene_length',
groupName = 'Genome', alpha = 0.5,
legendPosition = "top", binwidth = 0.15,
groupColors = col_bac1,addMeanLine=TRUE, meanLineColor="black",
meanLineType="dashed")+ theme_bw()+ ylim(0,15)+
labs(x = "Gene length (bp)", y = "Count")+ theme(legend.position="none")+
ggtitle("Bacteroidetes MAG1")+ xlim(0,2000)
# Bacteroidetes MAG2 gene length distribution
p_BAC2_length <- easyGgplot2::ggplot2.histogram(data = BAC2_gc_cog, xName = 'gene_length',
groupName = 'Genome', alpha = 0.5,
legendPosition = "top", binwidth = 0.15,
groupColors = col_bac2,addMeanLine=TRUE, meanLineColor="black",
meanLineType="dashed")+ theme_bw()+ ylim(0,15)+
labs(x = "Gene length (bp)", y = "Count")+ theme(legend.position="none")+
ggtitle("Bacteroidetes MAG2")+ xlim(0,2000)
grid.arrange(p_LIMNO_length, p_BAC1_length, p_BAC2_length, ncol = 3)We can do the same for the Pfams.
# Limnohabitans MAG gene length distribution
p_LIMNO_length <- easyGgplot2::ggplot2.histogram(data = LIMNO_gc_pfam, xName = 'gene_length',
groupName = 'Genome', alpha = 0.5,
legendPosition = "top", binwidth = 0.15,
groupColors = col_limno,addMeanLine=TRUE, meanLineColor="black",
meanLineType="dashed")+ theme_bw()+ ylim(0,25)+
labs(x = "Gene length (bp)", y = "Count")+ theme(legend.position="none")+
ggtitle("Limnohabitans MAG")+ xlim(0,3000)
# Bacteroidetes MAG1 gene length distribution
p_BAC1_length <- easyGgplot2::ggplot2.histogram(data = BAC1_gc_pfam, xName = 'gene_length',
groupName = 'Genome', alpha = 0.5,
legendPosition = "top", binwidth = 0.15,
groupColors = col_bac1,addMeanLine=TRUE, meanLineColor="black",
meanLineType="dashed")+ theme_bw()+ ylim(0,25)+
labs(x = "Gene length (bp)", y = "Count")+ theme(legend.position="none")+
ggtitle("Bacteroidetes MAG1")+ xlim(0,3000)
# Bacteroidetes MAG2 gene length distribution
p_BAC2_length <- easyGgplot2::ggplot2.histogram(data = BAC2_gc_pfam, xName = 'gene_length',
groupName = 'Genome', alpha = 0.5,
legendPosition = "top", binwidth = 0.15,
groupColors = col_bac2,addMeanLine=TRUE, meanLineColor="black",
meanLineType="dashed")+ theme_bw()+ ylim(0,25)+
labs(x = "Gene length (bp)", y = "Count")+ theme(legend.position="none")+
ggtitle("Bacteroidetes MAG2")+ xlim(0,3000)
grid.arrange(p_LIMNO_length, p_BAC1_length, p_BAC2_length, ncol = 3)# Find unique functions in Limnohabitans MAG vs Bacteroidetes MAG1
unique_LIMNO_BAC1 <- dplyr::anti_join(LIMNO_gc_cog, BAC1_gc_cog, by = "cog_id")
cat("There are", paste(nrow(unique_LIMNO_BAC1)), "unique COGs in Limnohabitans MAG vs Bacteroidetes MAG1")## There are 1178 unique COGs in Limnohabitans MAG vs Bacteroidetes MAG1
# Find unique functions in Limnohabitans MAG vs Bacteroidetes MAG2
unique_LIMNO_BAC2 <- dplyr::anti_join(LIMNO_gc_cog, BAC2_gc_cog, by = "cog_id")
cat("There are", paste(nrow(unique_LIMNO_BAC2)), "unique COGs in Limnohabitans MAG vs Bacteroidetes MAG2")## There are 1143 unique COGs in Limnohabitans MAG vs Bacteroidetes MAG2
# Find unique functions in Bacteroidetes MAG1 vs Bacteroidetes MAG2
unique_BAC1_BAC2 <- dplyr::anti_join(BAC1_gc_cog, BAC2_gc_cog, by = "cog_id")
cat("There are", paste(nrow(unique_BAC1_BAC2)), "unique COGs in Bacteroidetes MAG1 vs Bacteroidetes MAG2")## There are 153 unique COGs in Bacteroidetes MAG1 vs Bacteroidetes MAG2
# Find unique functions in Bacteroidetes MAG1 vs Bacteroidetes MAG2
unique_BAC2_BAC1 <- dplyr::anti_join(BAC2_gc_cog, BAC1_gc_cog, by = "cog_id")
cat("There are", paste(nrow(unique_BAC2_BAC1)), "unique COGs in Bacteroidetes MAG2 vs Bacteroidetes MAG1")## There are 143 unique COGs in Bacteroidetes MAG2 vs Bacteroidetes MAG1
# Find unique functions in Limnohabitans MAG vs Bacteroidetes MAG1
unique_pfam_LIMNO_BAC1 <- dplyr::anti_join(LIMNO_gc_pfam, BAC1_gc_pfam, by = "pfam_id")
cat("There are", paste(nrow(unique_pfam_LIMNO_BAC1)), "unique Pfams in Limnohabitans MAG vs Bacteroidetes MAG1")## There are 1474 unique Pfams in Limnohabitans MAG vs Bacteroidetes MAG1
# Find unique functions in Limnohabitans MAG vs Bacteroidetes MAG2
unique_pfam_LIMNO_BAC2 <- dplyr::anti_join(LIMNO_gc_pfam, BAC2_gc_pfam, by = "pfam_id")
cat("There are", paste(nrow(unique_pfam_LIMNO_BAC2)), "unique Pfams in Limnohabitans MAG vs Bacteroidetes MAG2")## There are 1424 unique Pfams in Limnohabitans MAG vs Bacteroidetes MAG2
# Find unique functions in Bacteroidetes MAG1 vs Bacteroidetes MAG2
unique_pfam_BAC1_BAC2 <- dplyr::anti_join(BAC1_gc_pfam, BAC2_gc_pfam, by = "pfam_id")
cat("There are", paste(nrow(unique_pfam_BAC1_BAC2)), "unique Pfams in Bacteroidetes MAG1 vs Bacteroidetes MAG2")## There are 285 unique Pfams in Bacteroidetes MAG1 vs Bacteroidetes MAG2
# Find unique functions in Bacteroidetes MAG1 vs Bacteroidetes MAG2
unique_pfam_BAC2_BAC1 <- dplyr::anti_join(BAC2_gc_pfam, BAC1_gc_pfam, by = "pfam_id")
cat("There are", paste(nrow(unique_pfam_BAC2_BAC1)), "unique Pfams in Bacteroidetes MAG2 vs Bacteroidetes MAG1")## There are 233 unique Pfams in Bacteroidetes MAG2 vs Bacteroidetes MAG1
Get COG ID to COG functional category mapping file here: ftp://ftp.ncbi.nih.gov/pub/wolf/COGs/COG0303/cogs.csv
The exact statistical analysis to compare genomes based on these profiles should be performed in STAMP.
# Import COG mapping file
cogid_2_cogcat <- read.csv("./Mapping_files/cogid_2_cogcat.csv", sep = ",", header = FALSE, fill = TRUE,col.names = c("COG_ID", "COG_class", "function"))[, 1:2]
cogid_2_cogcat <- cogid_2_cogcat[(cogid_2_cogcat$COG_class)!="", ]
cogid_2_cogcat <- droplevels(cogid_2_cogcat)
# Read COG category file
cog_categories <- read.table("./Mapping_files/cog_categories.tsv", header = TRUE, sep = "\t")
# Merge COG metadata
cog_meta <- dplyr::left_join(cog_categories, cogid_2_cogcat, by = c("COG_class" = "COG_class"))
cog_meta <- droplevels(cog_meta)
# Merge genome information of all genome bins
merged_gc_cog <- rbind(LIMNO_gc_cog, BAC1_gc_cog, BAC2_gc_cog)
merged_gc_cog <- data.frame(merged_gc_cog, genome_id = c(rep("Limnohabitans MAG", nrow(LIMNO_gc_cog)),
rep("Bacteroidetes MAG1", nrow(BAC1_gc_cog)),
rep("Bacteroidetes MAG2", nrow(BAC2_gc_cog)))
)
# Merge this metadata with the genome data from before
# COGs with multiple classifications are currently still NA - work on this.
merged_gc_cog <- dplyr::left_join(merged_gc_cog, cog_meta, by = c("cog_id" = "COG_ID"))
merged_gc_cog <- merged_gc_cog[!is.na(merged_gc_cog$COG_functional_category),]
# Visualize distribution across major metabolism functional COG groups per genome.
p_cog_func_group <- ggplot(data = merged_gc_cog,
aes(x=COG_functional_category, fill = COG_functional_cluster))+
geom_bar(stat="count", width=0.7, color = "black", size = 0.75)+
theme_bw()+
facet_grid(genome_id~.)+
scale_fill_brewer(palette = "Accent")+
labs(x = "Gene length (bp)", y = "Count")+
theme(legend.position="bottom", axis.text.x = element_text(angle = 90, hjust = 1),
legend.text = element_text(size = 5))+
guides(fill=guide_legend(nrow=2,byrow=TRUE))
print(p_cog_func_group)p_cog_func_clust <- ggplot(data = merged_gc_cog,
aes(x=COG_functional_cluster, fill = COG_functional_cluster))+
geom_bar(stat="count", width=0.7, color = "black", size = 0.75)+
theme_bw()+
facet_grid(genome_id~.)+
scale_fill_brewer(palette = "Accent")+
labs(x = "Gene length (bp)", y = "Count")+
theme(legend.position="bottom",axis.text.x = element_text(angle = 90, hjust = 1),
legend.text = element_text(size = 5))+
guides(fill=guide_legend(nrow=2,byrow=TRUE))
print(p_cog_func_clust)# Import data
ko_path_df <- read.table("./Mapping_files/ko00000.keg", header = FALSE, sep = ";",
skip = 3, quote = "", fill = TRUE,
col.names = c("Level", "KO", "Function_abbrev", "Function_spec"))
ko_path_df <- ko_path_df[1:(nrow(ko_path_df)-1), ] # remove tailing "!" at the end of file
# Remove empty rows
ko_path_df$KO <- as.character(ko_path_df$KO); ko_path_df$Level <- as.character( ko_path_df$Level)
ko_path_df$KO[grep("A",ko_path_df$Level)] <- ko_path_df$Level[grep("A",ko_path_df$Level)]
ko_path_df$Level[grep("A",ko_path_df$Level)] <- "A"
ko_path_df <- ko_path_df[!ko_path_df$KO == "", ]
# Extract pathways from dataframe and add them as new factor column
# pathw_hier <- data.frame(Level = ko_path_df$Level[ko_path_df$Level %in% c("A","B","C")],
# Name = ko_path_df$KO[ko_path_df$Level %in% c("A","B","C")]
# )
# pathw_hier$Name <- as.character(pathw_hier$Name)
ko_path_df <- data.frame(ko_path_df, level_A = "A", level_B = "B", level_C = "C",
stringsAsFactors = FALSE)
ko_path_df$KO <- as.character(ko_path_df$KO)
# Get positions where to replicate higher hierarichcal level
pos_A <- c(c(1:nrow(ko_path_df))[ko_path_df$Level %in% "A"], nrow(ko_path_df)+1)
pos_B <- c(c(1:nrow(ko_path_df))[ko_path_df$Level %in% "B"], nrow(ko_path_df)+1)
pos_C <- c(c(1:nrow(ko_path_df))[ko_path_df$Level %in% "C"], nrow(ko_path_df)+1)
for(i in 1:(length(pos_A)-1)){
ko_path_df$level_A[pos_A[i]:(pos_A[i+1]-1)] <- ko_path_df$KO[pos_A[i]]
}
for(i in 1:(length(pos_B)-1)){
ko_path_df$level_B[pos_B[i]:(pos_B[i+1]-1)] <- ko_path_df$KO[pos_B[i]]
}
for(i in 1:(length(pos_C)-1)){
ko_path_df$level_C[pos_C[i]:(pos_C[i+1]-1)] <- ko_path_df$KO[pos_C[i]]
}
# Remove all rows with level A, B, C - and level column
ko_path_df <- ko_path_df[!ko_path_df$Level %in% c("A","B", "C"), ]
ko_path_df$level_A <- gsub(ko_path_df$level_A, pattern = "<b>|</b>", replacement = "")
ko_path_df$level_B <- gsub(ko_path_df$level_B, pattern = "<b>|</b>", replacement = "")
ko_path_df <- ko_path_df[, -1]
# Annotate merged ko file
merged_gc_ko <- rbind(LIMNO_gc_KO, BAC1_gc_KO, BAC2_gc_KO)
merged_gc_ko <- data.frame(merged_gc_ko,
genome_id = c(rep("Limnohabitans MAG", nrow(LIMNO_gc_KO)),
rep("Bacteroidetes MAG1", nrow(BAC1_gc_KO)),
rep("Bacteroidetes MAG2", nrow(BAC2_gc_KO)))
)
merged_gc_ko$ko_id <- gsub(merged_gc_ko$ko_id, pattern = "KO:", replacement = "")
merged_gc_ko <- dplyr::left_join(merged_gc_ko, ko_path_df[, c(1, 4:6)], by = c("ko_id" = "KO"))
# Fill up NA slots with "Unknown" pathway
merged_gc_ko$level_A[is.na(merged_gc_ko$level_A)] <- "Unknown"
merged_gc_ko$level_B[is.na(merged_gc_ko$level_B)] <- "Unknown"
merged_gc_ko$level_C[is.na(merged_gc_ko$level_C)] <- "Unknown"Get CodonO for linux here: http://sysbio.cvm.msstate.edu/software/CodonO/download
CU.linux input.genes.fnaOutput from CodonO:
inputfile.ok —- the SCUO units for each input sequence based on its order
inputfile.fik —- the composition ratio of the i-th amino acid in the k-th sequence
inputfile.hijk —- the frequency of the j-th degenerate codon for amino acid i in each sequence
This output was not sufficient to perform any analysis. Therefore I used the web browser to calculate the synonymous codon bias in the genes of the three genomes.
** Codon bias is proportional to mRNA production (Wan et al 2004)
** However, strong correlation between SCUO and %GC has been reported, thereby confounding possible “biological” effects.. Beware..
# Import and format data for Limnohabitans MAG
SCUO_LIMNO <- read.table("./IMG_annotation/IMG_2724679690_Limnohabitans/Annotation/2724679690.genes.fna.codonO.output",
sep = ",", blank.lines.skip = TRUE, allowEscapes = FALSE, skipNul = TRUE
)
SCUO_LIMNO$V1 <- gsub(SCUO_LIMNO$V1, pattern = "\t", replacement = "")
Gene_LIMNO <- do.call(rbind, strsplit(SCUO_LIMNO$V1[grep("Ga0*", SCUO_LIMNO$V1)], " "))[,2]
SCUO_LIMNO$V1 <- gsub(SCUO_LIMNO$V1, pattern = " ", replacement = "")
SCUO_LIMNO <- data.frame(GC = SCUO_LIMNO$V1[grep(x = SCUO_LIMNO$V1, pattern = "GC*.*=")],
SCUO = rep(SCUO_LIMNO$V1[grep(x = SCUO_LIMNO$V1, pattern = "SCUO*")], each = 4),
Gene = rep(Gene_LIMNO, each = 4)
)
SCUO_LIMNO$GC <- as.numeric(gsub(SCUO_LIMNO$GC, pattern = ".*=", replacement = ""))
SCUO_LIMNO$SCUO <- as.numeric(gsub(SCUO_LIMNO$SCUO, pattern = ".*=", replacement = ""))
# Import and format data for Bacteroidetes MAG1
SCUO_BAC1 <- read.table("./IMG_annotation/IMG_2724679691_Bacteroidetes_bin1/Annotation/2724679691.genes.fna.codonO.output",
sep = ",", blank.lines.skip = TRUE, allowEscapes = FALSE, skipNul = TRUE)
SCUO_BAC1$V1 <- gsub(SCUO_BAC1$V1, pattern = "\t", replacement = "")
Gene_BAC1 <- do.call(rbind, strsplit(SCUO_BAC1$V1[grep("Ga0*", SCUO_BAC1$V1)], " "))[,2]
SCUO_BAC1$V1 <- gsub(SCUO_BAC1$V1, pattern = " ", replacement = "")
SCUO_BAC1 <- data.frame(GC = SCUO_BAC1$V1[grep(x = SCUO_BAC1$V1, pattern = "GC*.*=")],
SCUO = rep(SCUO_BAC1$V1[grep(x = SCUO_BAC1$V1, pattern = "SCUO*")], each = 4),
Gene = rep(Gene_BAC1, each = 4)
)
SCUO_BAC1$GC <- as.numeric(gsub(SCUO_BAC1$GC, pattern = ".*=", replacement = ""))
SCUO_BAC1$SCUO <- as.numeric(gsub(SCUO_BAC1$SCUO, pattern = ".*=", replacement = ""))
# Import and format data for Bacteroidetes MAG2
SCUO_BAC2 <- read.table("./IMG_annotation/IMG_2724679698_Bacteroidetes_bin2/Annotation/2724679698.genes.fna.codonO.output",
sep = ",", blank.lines.skip = TRUE, allowEscapes = FALSE, skipNul = TRUE)
SCUO_BAC2$V1 <- gsub(SCUO_BAC2$V1, pattern = "\t", replacement = "")
Gene_BAC2 <- do.call(rbind, strsplit(SCUO_BAC2$V1[grep("Ga0*", SCUO_BAC2$V1)], " "))[,2]
SCUO_BAC2$V1 <- gsub(SCUO_BAC2$V1, pattern = " ", replacement = "")
SCUO_BAC2 <- data.frame(GC = SCUO_BAC2$V1[grep(x = SCUO_BAC2$V1, pattern = "GC*.*=")],
SCUO = rep(SCUO_BAC2$V1[grep(x = SCUO_BAC2$V1, pattern = "SCUO*")], each = 4),
Gene = rep(Gene_BAC2, each = 4)
)
SCUO_BAC2$GC <- as.numeric(gsub(SCUO_BAC2$GC, pattern = ".*=", replacement = ""))
SCUO_BAC2$SCUO <- as.numeric(gsub(SCUO_BAC2$SCUO, pattern = ".*=", replacement = ""))
# Merge data to one dataframe
SCUO_merged_gen <- data.frame(rbind(SCUO_LIMNO, SCUO_BAC1, SCUO_BAC2),
Genome_ID = c(rep("Limnohabitans MAG", nrow(SCUO_LIMNO)), rep("Bacteroidetes MAG1", nrow(SCUO_BAC1)),
rep("Bacteroidetes MAG2", nrow(SCUO_BAC2))),
GCx = rep(c("GC_mean", "GC1", "GC2", "GC3"),
(nrow(SCUO_LIMNO) + nrow(SCUO_BAC1) + nrow(SCUO_BAC2))/4
)
)
# Merge codon bias data with KO pathway annotation
SCUO_merged <- dplyr::left_join(SCUO_merged_gen, merged_gc_ko[, c(1:2,4 ,14:21)], by = c("Gene" = "contig_geneID"))
# Visualize differences in codon bias
p_SCUO.1 <- ggplot(data = SCUO_merged, aes (x = 100*GC, y = SCUO, fill = Genome_ID))+
geom_point(size = 4, shape = 21, alpha = 0.7)+
scale_fill_manual(values = c(col_bac1, col_bac2, col_limno))+
theme_bw()+
facet_wrap(~GCx, ncol = 2)+
theme(axis.text=element_text(size=14), axis.title=element_text(size=20),
title=element_text(size=20), legend.text=element_text(size=14),
legend.background = element_rect(fill="transparent"),
axis.text.x = element_text(angle = 0, hjust = 1),
strip.text.x=element_text(size=18),
legend.position = "bottom")+
ylab("SCUO")+
xlab("%GC")+
ylim(0,1)
print(p_SCUO.1)# Subset to genes for which ko annotation is available
SCUO_merged_sb <- SCUO_merged[!is.na(SCUO_merged$level_A), ]
SCUO_merged_sb <- SCUO_merged_sb[SCUO_merged_sb$GCx == "GC_mean", ]
# Look at pathways enriched in high %GC
# SCUO_merged_sb[]
SCUO_merged_gen_gcmean <- SCUO_merged_gen %>% dplyr::filter(GCx == "GC_mean")
p_SCUO.2 <- ggplot(data = SCUO_merged_gen_gcmean, aes (x = Genome_ID, y = SCUO))+
geom_jitter(size = 3, alpha = 0.3, shape = 21, aes(fill = Genome_ID))+
geom_boxplot(alpha=0, size =1.5, color = "darkorange")+
# scale_fill_brewer(palette = "Accent")+
scale_fill_manual(values = c(col_bac1, col_bac2, col_limno))+
theme_bw()+
# facet_wrap(Genome_ID~GCx)+
theme(axis.text=element_text(size=14), axis.title=element_text(size=20),
title=element_text(size=20), legend.text=element_text(size=14),
legend.background = element_rect(fill="transparent"),
axis.text.x = element_text(angle = 45, hjust = 1),
strip.text.x=element_text(size=18))+
ylab("Codon bias - SCUO")+
xlab("")+
ylim(0,1)+
guides(fill=FALSE)+
scale_x_discrete(labels=c("Bacteroidetes MAG1" = paste("Bacteroidetes MAG1 (n=",table(SCUO_merged_gen_gcmean$Genome_ID)[1],")", sep = ""),
"Bacteroidetes MAG2" = paste("Bacteroidetes MAG2 (n=",table(SCUO_merged_gen_gcmean$Genome_ID)[2],")", sep = ""),
"Limnohabitans MAG" = paste("Limnohabitans MAG (n=",table(SCUO_merged_gen_gcmean$Genome_ID)[3],")", sep = ""))
)
#
print(p_SCUO.2)tmp <- SCUO_merged_sb$genome_id
tmp2 <- cbind(SCUO_merged_sb$ko_id,
c(rep(col_limno, table(tmp)[3]), rep(col_bac1, table(tmp)[1]), rep(col_bac2, table(tmp)[2]))
)
write.table(tmp2, file = "All_KO.tsv", quote = FALSE,
col.names = FALSE, row.names = FALSE)# Import codonO results of reference genomes
ref_SCUO <- codonO_2_df(pathx = "./IMG_annotation/References/codonO_output/")
# reimport codonO results of Limno genome
Ramli_SCUO <- codonO_2_df(pathx = "./IMG_annotation/IMG_2724679690_Limnohabitans/Annotation/",
patternx = "codonO")
# Merge dataframes
ref_limno_SCUO <- rbind(ref_SCUO, Ramli_SCUO)
# Replace genome names by better annotated names
map_scuo <- read.delim("./Mapping_files/codonO_ref_names.tsv", stringsAsFactors = FALSE)
ref_limno_SCUO$Genome <- as.character(ref_limno_SCUO$Genome)
for(i in 1:nrow(map_scuo)){
ref_limno_SCUO$Genome[ref_limno_SCUO$Genome %in% map_scuo$codon_file[i]] <- map_scuo$ref_name[i]
}
ref_limno_SCUO$Genome[ref_limno_SCUO$Genome %in% "2724679690.genes.fna.codonO.output"] <- "Ramli. sp. MAG"
# order data according to mean SCUO
# sum_scuo <- ref_limno_SCUO %>% group_by(Genome) %>% summarize(mean_scuo = mean(SCUO))
# ref_limno_SCUO$Genome <- factor(ref_limno_SCUO$Genome, levels = sum_scuo$Genome[order(sum_scuo$mean_scuo)])
ord_list_bin <- c("Lim. sp. Rim11", "Lim. sp. 103DPR2",
"Lim. sp. 2KL-27", "Lim. sp. Rim47",
"Lim. sp. II-D5", "Lim. sp. 2KL-3",
"Rhodo. sp. ED16","Rhodo. sp. T118",
"Curvi. sp. ATCC", "Curvi. sp. PAE-UM",
"Vario. sp. 110B", "Vario. sp. EPS",
"Ramli. sp. Leaf400", "Ramli. sp. TTB310",
"Ramli. sp. MAG",
"Ramli. sp. 5-10"
)
# order rows and columns
ref_limno_SCUO$Genome <- factor(ref_limno_SCUO$Genome, levels = ord_list_bin)
# ref_limno_SCUO$new <- ref_limno_SCUO$Genome=="Lim. MAG (2724679690)"
# Plot SCUO profiles
p_ramli_SCUO <- ref_limno_SCUO %>% dplyr::filter(GCx == "GC_mean") %>%
ggplot(aes(x = Genome, y = SCUO, fill = Genome))+
theme_bw()+
# geom_rect(data = tp, aes(fill = new), xmin = -Inf, xmax = Inf,
# ymin = -Inf,ymax = Inf, alpha = 0.005, show.legend =FALSE, inherit.aes = FALSE)+
geom_violin(alpha = 0.4, adjust = 1, draw_quantiles = TRUE)+
scale_fill_manual(values = c(rep(adjustcolor("#c8c8ff",0.8),6), rep("#f8cf94",2),
rep("#adf7ad",2), rep(adjustcolor("#000000",0.21),2),
rep("#e2a2fd",4)))+
theme(axis.text=element_text(size=13), axis.title=element_text(size=20),
title=element_text(size=20), legend.text=element_text(size=14),
legend.background = element_rect(fill="transparent"),
# axis.text.x = element_text(angle = 65, hjust = 1),
strip.text.x=element_text(size=18),
legend.position="bottom",
axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())+
guides(fill=FALSE)+
stat_summary(fun.data=mean_sdl, fun.args = list(mult = 1),
geom="pointrange", color="black")+
xlab("")+
ylab("")+
# geom_vline(xintercept = 9.5, col = 'black', lwd = 2, linetype = 1, alpha = 0.6)+
ylim(0,1)
p_ramli_SCUOp_ramli_GC <- ref_limno_SCUO %>% dplyr::filter(GCx == "GC_mean") %>%
ggplot(aes(x = Genome, y = GC, fill = Genome))+
theme_bw()+
geom_hline(yintercept = 0.7, col = 'black', lwd = 1, linetype = 2, alpha = 0.6)+
# geom_rect(data = tp, aes(fill = new), xmin = -Inf, xmax = Inf,
# ymin = -Inf,ymax = Inf, alpha = 0.005, show.legend =FALSE, inherit.aes = FALSE)+
geom_violin(alpha = 0.4, adjust = 1, draw_quantiles = TRUE)+
scale_fill_manual(values = c(rep(adjustcolor("#c8c8ff",0.8),6), rep("#f8cf94",2),
rep("#adf7ad",2), rep(adjustcolor("#000000",0.21),2),
rep("#e2a2fd",4)))+
theme(axis.text=element_text(size=13), axis.title=element_text(size=20),
title=element_text(size=20), legend.text=element_text(size=14),
legend.background = element_rect(fill="transparent"),
# axis.text.x = element_text(angle = 65, hjust = 1),
strip.text.x=element_text(size=18),
legend.position="bottom",
axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())+
guides(fill=FALSE)+
stat_summary(fun.data=mean_sdl, fun.args = list(mult = 1),
geom="pointrange", color="black")+
xlab("")+
ylab("")+
scale_y_continuous(labels = function(x) sprintf("%.2f", x), breaks = seq(0.20,0.90,0.10),
limits = c(0.2,0.9))
# scale_y_continuous(breaks = seq(0.20,0.90,0.10), limits = c(0.2,0.9))
p_ramli_GC# Format file for annotation of phylogenetic treeUnrooted phylogenomic tree used in codeML analysis. Green branch indicates the genome for which PSG were tested.
Alternatively, the entire Ramlibacter clade can be tested for PSGs: Green branch indicates the clade for which the LCA was tested for PSGs.
PosiGene also tries to minimize false positives/negatives through additional filtering:
This filtering step can also be seen as an instrument to reduce false negatives. Few badly conserved sequences can force the first mentioned filter to delete large parts of the MSA reducing the power of the test and potentially removing positively selected sites. Third, entire MSAs can be discarded if they are considered unreliable for the following reasons, if: (i) a small absolute number or a small percentage of alignment columns or anchor species codons remain after the first filtering step, (ii) few sequences remain after the second filtering step, (iii) disproportional dN/dS ratios (e.g. ≥100 in foreground branch) were calculated by CODEML or (iv) an implausibly high fraction of positively selected sites was inferred. Additionally, MSAs will only be considered if at least one species from the sister taxon (i.e. the most closely-related species/clade) of the examined branch is represented in it. Without this condition it is not possible to say whether potentially observed selective pressure worked on the branch of interest or before in evolution .
# Import results file of genome-specific PSGs
data_posi <- read.table("./posigene_analysis/result_tables/Ramlibacter_MAG_results.tsv", header = TRUE, fill = TRUE, sep = "\t")[, c("Transcript", "P.Value","FDR", "HA.foreground.omega",
"Number.of.Sites.under.positive.Selection")]
# Import results file of Ramlibacter clade-specific PSGs
data_posi_clade <- read.table("./posigene_analysis/result_tables_clade/Ramlibacter_MAG_results.tsv", header = TRUE, fill = TRUE, sep = "\t")[, c("Transcript", "P.Value","FDR", "HA.foreground.omega",
"Number.of.Sites.under.positive.Selection")]
colnames(data_posi)[1] <- "Gene"; colnames(data_posi_clade)[1] <- "Gene"
data_posi$Gene <- as.character(data_posi$Gene)
data_posi_clade$Gene <- as.character(data_posi_clade$Gene)
# Import mapping file to link gene IDs from PosiGene to
# those used by the IMG annotation (both are in headers from .genes.fna)
map_posi <- read.table("./Mapping_files/posiG_mapping.tsv")
colnames(map_posi) <- c("posi_geneID", "IMG_geneID")
map_posi$posi_geneID <- as.character(map_posi$posi_geneID)
# Filter out the genes that are under positive selection
# Taking threshold of adjusted p.value of 0.05 and FDR < 0.05
# Also remove dN/dS ratios of less than 15.
data_posi <- data_posi %>% filter(P.Value < 0.05 &
FDR < 0.05 & HA.foreground.omega < 10)
data_posi_clade <- data_posi_clade %>% filter(P.Value < 0.05 &
FDR < 0.05 & HA.foreground.omega < 10)
# data_posi <- data_posi %>% filter(P.Value < 0.05 &
# FDR < 0.05 & HA.foreground.omega < 10 &
# Number.of.Sites.under.positive.Selection > 10)
# Report summary
cat(paste("There are ", nrow(data_posi), " genes under positive selection in this MAG (P<0.05). This is ",round(100*nrow(data_posi)/nrow(map_posi),1), "% of all genes", sep = "")
)## There are 415 genes under positive selection in this MAG (P<0.05). This is 10.8% of all genes
cat(paste("There are ", nrow(data_posi_clade), " genes under positive selection in the Ramlibacter clade (P<0.05). This is ", round(100*nrow(data_posi_clade)/nrow(map_posi),1), "% of all genes in the Ramlibacter sp. MAG which was used as reference/anchor species", sep = "")
)## There are 184 genes under positive selection in the Ramlibacter clade (P<0.05). This is 4.8% of all genes in the Ramlibacter sp. MAG which was used as reference/anchor species
# Merge this data with the functional annotation (e.g. KO) of these genes
data_posi <- left_join(data_posi, map_posi, by = c("Gene" = "posi_geneID"))
data_posi_clade <- left_join(data_posi_clade, map_posi, by = c("Gene" = "posi_geneID"))
data_posi_KO <- left_join(data_posi, merged_gc_ko, by = c("IMG_geneID" = "contig_geneID"))
data_posi_clade_KO <- left_join(data_posi_clade, merged_gc_ko, by = c("IMG_geneID" = "contig_geneID"))
# Retain clade or MAG only genes in the respective dataframes
pos <- !data_posi_KO$Gene %in% data_posi_clade_KO$Gene
pos2 <- !data_posi_clade_KO$Gene %in% data_posi_KO$Gene
pos3 <- data_posi_clade_KO$Gene %in% data_posi_KO$Gene
data_posi_KO <- data_posi_KO[pos, ]
data_posi_clade_KO <- data_posi_clade_KO[pos2, ]
data_posi_clade_MAG_KO <- data_posi_clade_KO[pos3, ]
# Optional: write table for quick view in iPath v2
write.table(file = "KO_posiG.tsv", unique(data_posi_KO$ko_id), quote = FALSE,
row.names = FALSE, col.names = FALSE)
write.table(file = "KO_posiG_clade.tsv", unique(data_posi_clade_KO$ko_id), quote = FALSE,
row.names = FALSE, col.names = FALSE)
# cat("Genes under positive selection with KO annotation")
# Merge dataframes to plot
# Remove levels without "n_level" number of genes
n_level <- round(0.01*sum(table(data_posi_clade_KO$level_C)),0)
posiG_p_df_clade <- table(data_posi_clade_KO$level_C)[table(data_posi_clade_KO$level_C)>n_level]
posiG_p_df_clade <- data.frame(posiG_p_df_clade); posiG_p_df_clade$Var1 <- as.character(posiG_p_df_clade$Var1)
n_level <- round(0.01*sum(table(data_posi_KO$level_C)),0)
posiG_p_df_MAG <- table(data_posi_KO$level_C)[table(data_posi_KO$level_C)>n_level]
posiG_p_df_MAG <- data.frame(posiG_p_df_MAG); posiG_p_df_MAG$Var1 <- as.character(posiG_p_df_MAG$Var1)
n_level <- round(0.01*sum(table(data_posi_clade_MAG_KO$level_C)),0)
posiG_p_df_MAG_clade <- table(data_posi_clade_MAG_KO$level_C)[table(data_posi_clade_MAG_KO$level_C)>n_level]
posiG_p_df_MAG_clade <- data.frame(posiG_p_df_MAG_clade); posiG_p_df_MAG_clade$Var1 <- as.character(posiG_p_df_MAG_clade$Var1)
# Merge dataframes
posiG_p_df_merged <- data.frame(rbind(posiG_p_df_clade, posiG_p_df_MAG,
posiG_p_df_MAG_clade),
branch = factor(c(rep("clade", nrow(posiG_p_df_clade)),
rep("MAG", nrow(posiG_p_df_MAG)),
rep("clade+MAG", nrow(posiG_p_df_MAG_clade))),
levels = c("MAG", "clade", "clade+MAG"))
)
data_posi_KO_merge <- rbind(data_posi_KO, data_posi_clade_KO, data_posi_clade_MAG_KO)
# Merge with level B annotation
posiG_p_df_merged <- left_join(posiG_p_df_merged, data_posi_KO_merge[, c("level_A","level_B","level_C")],
by = c("Var1" = "level_C")) %>% distinct()
posiG_p_df_merged$level_B <- substring(posiG_p_df_merged$level_B, 7)
posiG_p_df_merged$level_A <- substring(posiG_p_df_merged$level_A, 7)
posiG_p_df_merged$Var1 <- substring(posiG_p_df_merged$Var1, 7)
posiG_p_df_merged$Var1 <- gsub("\\[[^\\]]*\\]", "", posiG_p_df_merged$Var1 , perl=TRUE)
posiG_p_df_merged$level_B[posiG_p_df_merged$level_B == "Cellular community - prokaryotes"] <- "Biofilm formation & quorum sensing"
# Sort according to frequency
posiG_p_df_merged$Var1 <- factor(posiG_p_df_merged$Var1, levels = unique(posiG_p_df_merged$Var1[rev(order(posiG_p_df_merged$Freq))]))
posiG_p_df_merged$level_B <- factor(posiG_p_df_merged$level_B, levels = unique(posiG_p_df_merged$level_B[rev(order(posiG_p_df_merged$Freq))]))
# Add extra column to shorten number of labels in legend
# Only show 12 most frequent categories
posiG_p_df_merged <- posiG_p_df_merged %>% mutate(level_C_short =
Var1)
tmp <- levels(posiG_p_df_merged$Var1)[1:11]
posiG_p_df_merged$level_C_short <- as.character(posiG_p_df_merged$level_C_short)
posiG_p_df_merged$level_C_short[!posiG_p_df_merged$level_C_short %in% tmp] <- "Other"
posiG_p_df_merged$level_C_short <- factor(posiG_p_df_merged$level_C_short,
levels = c(tmp,"Other"))
# Make plot
p_KO_posi <- ggplot(posiG_p_df_merged, aes(x = level_B, y = Freq, fill = level_C_short))+
geom_bar(stat="identity", color = "black")+
theme_bw()+
scale_fill_brewer(palette="Paired")+
ggtitle("Number of genes")+
ylab("") + xlab("")+
facet_grid(branch~.)+
theme(axis.text=element_text(size=12.5), axis.title=element_text(18),
title=element_text(size=18), legend.text=element_text(size=14),
legend.background = element_rect(fill="transparent"),
axis.text.x = element_text(angle = 65, hjust = 1),
strip.text=element_text(size=18),
plot.margin = unit(c(1,1,1,1), "cm"), legend.title = element_blank()
# ,legend.position = c(0.87, 0.85)
)
print(p_KO_posi)# Correlate genes under positive selection with their GC content
data_SCUO_posi <- SCUO_merged_gen_gcmean %>% filter(Genome_ID == "Limnohabitans MAG")
data_SCUO_posi <- droplevels(data_SCUO_posi)
data_SCUO_posi$posi_select <- data_SCUO_posi$Gene %in% data_posi$IMG_geneID
data_SCUO_posi$posi_select <- factor(data_SCUO_posi$posi_select, levels = c("FALSE", "TRUE"))
# merge with dN/dS ratio data
data_SCUO_posi <- left_join(data_SCUO_posi,
data_posi, by = c("Gene" = "IMG_geneID"))
data_SCUO_posi_only <- data_SCUO_posi %>% filter(posi_select == "TRUE")
# Selected annotation level (KO)
selected_KOlevels <- c("Signal transduction",
"Amino acid metabolism",
"Metabolism of other amino acids",
"Metabolism of cofactors and vitamins",
"Carbohydrate metabolism",
"Energy metabolism",
"Translation",
"Membrane transport",
"Biosynthesis of other secondary metabolites",
"Lipid metabolism",
"Membrane transport",
"Unknown"
)
# Extract level_C labels of genes in top 5 dN/ds ratio
# First filter out the genes with multiple annotations by
# selecting the annotation with the highest identity.
# data_posi_KO_unique <- data_posi_KO %>% group_by(Gene) %>%
# filter(percent_identity == max(percent_identity))
top <- 10
thresh <- sort(data_posi_KO[data_posi_KO$level_B %in% selected_KOlevels, ]$HA.foreground.omega, decreasing = TRUE)[top+1]
selected_KOlevels_labels <- data_posi_KO[data_posi_KO$level_B %in% selected_KOlevels, ]$level_C
selected_KOlevels_labels[data_posi_KO[data_posi_KO$level_B %in% selected_KOlevels, ]$HA.foreground.omega < thresh] <- ""
# Remove everything between brackets ([*])
selected_KOlevels_labels <- gsub(pattern = " *\\[.*?\\] *",
replacement = "", selected_KOlevels_labels)
# Remove everything before first space
selected_KOlevels_labels <- sub(".*? (.+)", "\\1", selected_KOlevels_labels)
p_SCUO_posi1 <- data_posi_KO[data_posi_KO$level_B %in% selected_KOlevels, ] %>%
ggplot(aes(x = level_B, y = HA.foreground.omega,
fill = level_B))+
# geom_jitter(shape = 21, aes(fill = posi_select, size = posi_select), width = 0.2)+
geom_jitter(shape = 21, size = 3, width = 0.2, alpha = 0.5)+
geom_boxplot(alpha=0.5, size =1, color = "black", width = 0.35,
outlier.shape = NA)+
scale_fill_brewer(palette = "Set3")+
# scale_fill_manual(values = c(adjustcolor(col_limno, 0.1), adjustcolor("red", 0.5)))+
# scale_size_manual(values = c(2.5,4))+
theme_bw()+
# facet_wrap(Genome_ID~GCx)+
theme(axis.text=element_text(size=12.5), axis.title=element_text(size=20),
title=element_text(size=20), legend.text=element_text(size=14),
legend.background = element_rect(fill="transparent"),
axis.text.x = element_text(angle = 60, hjust = 1),
strip.text.x=element_text(size=18),
legend.position = "bottom")+
xlab("")+
ylab(expression(dN/dS))+
ylim(0,20)+
guides(size = FALSE, fill = FALSE)+
geom_label_repel(aes(label = selected_KOlevels_labels), fontface = 'bold', color = 'black',
box.padding = 0.35, point.padding = 0.5,
segment.color = "#3F4656",
size = 3,
# Width of the line segments.
segment.size = 1,
# Draw an arrow from the label to the data point.
arrow = arrow(length = unit(0.02, 'npc'), type = "closed"),
nudge_x = -0.1,
nudge_y = 0.6,
force = 10
)
p_SCUO_posi1# Extract level_C labels of genes in top 5 dN/ds ratio
top <- 10
thresh <- sort(data_posi_KO[data_posi_KO$level_B %in% selected_KOlevels, ]$HA.foreground.omega, decreasing = TRUE)[top+1]
selected_KOlevels_labels <- data_posi_KO[data_posi_KO$level_B %in% selected_KOlevels, ]$ko_name
selected_KOlevels_labels[data_posi_KO[data_posi_KO$level_B %in% selected_KOlevels, ]$HA.foreground.omega < thresh] <- ""
# Remove everything between brackets ([*])
selected_KOlevels_labels <- gsub(pattern = " *\\[.*?\\] *",
replacement = "", selected_KOlevels_labels)
p_SCUO_posi2 <- data_posi_KO[data_posi_KO$level_B %in% selected_KOlevels, ] %>%
ggplot(aes(x = level_B, y = HA.foreground.omega,
fill = level_B))+
# geom_jitter(shape = 21, aes(fill = posi_select, size = posi_select), width = 0.2)+
geom_jitter(shape = 21, size = 3, width = 0.2, alpha = 0.5)+
geom_boxplot(alpha=0.5, size =1, color = "black", width = 0.35,
outlier.shape = NA)+
scale_fill_brewer(palette = "Set3")+
# scale_fill_manual(values = c(adjustcolor(col_limno, 0.1), adjustcolor("red", 0.5)))+
# scale_size_manual(values = c(2.5,4))+
theme_bw()+
# facet_wrap(Genome_ID~GCx)+
theme(axis.text=element_text(size=12.5), axis.title=element_text(size=20),
title=element_text(size=20), legend.text=element_text(size=14),
legend.background = element_rect(fill="transparent"),
axis.text.x = element_text(angle = 60, hjust = 1),
strip.text.x=element_text(size=18),
legend.position = "bottom")+
xlab("")+
ylab(expression(dN/dS))+
ylim(0,20)+
guides(size = FALSE, fill = FALSE)+
geom_label_repel(aes(label = selected_KOlevels_labels), fontface = 'bold', color = 'black',
box.padding = 0.35, point.padding = 0.5,
segment.color = "#3F4656",
size = 3,
# Width of the line segments.
segment.size = 1,
# Draw an arrow from the label to the data point.
arrow = arrow(length = unit(0.02, 'npc'), type = "closed"),
nudge_x = -0.1,
nudge_y = 0.6,
force = 10
)
p_SCUO_posi2# Do the same thing for the Ramlibacter clade-level PSGs
top <- 10
thresh <- sort(data_posi_clade_KO[data_posi_clade_KO$level_B %in% selected_KOlevels, ]$HA.foreground.omega, decreasing = TRUE)[top+1]
selected_KOlevels_labels <- data_posi_clade_KO[data_posi_clade_KO$level_B %in% selected_KOlevels, ]$level_C
selected_KOlevels_labels[data_posi_clade_KO[data_posi_clade_KO$level_B %in% selected_KOlevels, ]$HA.foreground.omega < thresh] <- ""
# Remove everything between brackets ([*])
selected_KOlevels_labels <- gsub(pattern = " *\\[.*?\\] *",
replacement = "", selected_KOlevels_labels)
# Remove everything before first space
selected_KOlevels_labels <- sub(".*? (.+)", "\\1", selected_KOlevels_labels)
p_SCUO_posi3 <- data_posi_clade_KO[data_posi_clade_KO$level_B %in% selected_KOlevels, ] %>%
ggplot(aes(x = level_B, y = HA.foreground.omega,
fill = level_B))+
# geom_jitter(shape = 21, aes(fill = posi_select, size = posi_select), width = 0.2)+
geom_jitter(shape = 21, size = 3, width = 0.2, alpha = 0.5)+
geom_boxplot(alpha=0.5, size =1, color = "black", width = 0.35,
outlier.shape = NA)+
scale_fill_brewer(palette = "Set3")+
# scale_fill_manual(values = c(adjustcolor(col_limno, 0.1), adjustcolor("red", 0.5)))+
# scale_size_manual(values = c(2.5,4))+
theme_bw()+
# facet_wrap(Genome_ID~GCx)+
theme(axis.text=element_text(size=12.5), axis.title=element_text(size=20),
title=element_text(size=20), legend.text=element_text(size=14),
legend.background = element_rect(fill="transparent"),
axis.text.x = element_text(angle = 60, hjust = 1),
strip.text.x=element_text(size=18),
legend.position = "bottom")+
xlab("")+
ylab(expression(dN/dS))+
ylim(0,20)+
guides(size = FALSE, fill = FALSE)+
geom_label_repel(aes(label = selected_KOlevels_labels), fontface = 'bold', color = 'black',
box.padding = 0.35, point.padding = 0.5,
segment.color = "#3F4656",
size = 3,
# Width of the line segments.
segment.size = 1,
# Draw an arrow from the label to the data point.
arrow = arrow(length = unit(0.02, 'npc'), type = "closed"),
nudge_x = -0.1,
nudge_y = 0.6,
force = 10
)
p_SCUO_posi3# Extract level_C labels of genes in top 5 dN/ds ratio
top <- 10
thresh <- sort(data_posi_clade_KO[data_posi_clade_KO$level_B %in% selected_KOlevels, ]$HA.foreground.omega, decreasing = TRUE)[top+1]
selected_KOlevels_labels <- data_posi_clade_KO[data_posi_clade_KO$level_B %in% selected_KOlevels, ]$ko_name
selected_KOlevels_labels[data_posi_clade_KO[data_posi_clade_KO$level_B %in% selected_KOlevels, ]$HA.foreground.omega < thresh] <- ""
# Remove everything between brackets ([*])
selected_KOlevels_labels <- gsub(pattern = " *\\[.*?\\] *",
replacement = "", selected_KOlevels_labels)
p_SCUO_posi4 <- data_posi_clade_KO[data_posi_clade_KO$level_B %in% selected_KOlevels, ] %>%
ggplot(aes(x = level_B, y = HA.foreground.omega,
fill = level_B))+
# geom_jitter(shape = 21, aes(fill = posi_select, size = posi_select), width = 0.2)+
geom_jitter(shape = 21, size = 3, width = 0.2, alpha = 0.5)+
geom_boxplot(alpha=0.5, size =1, color = "black", width = 0.35,
outlier.shape = NA)+
scale_fill_brewer(palette = "Set3")+
# scale_fill_manual(values = c(adjustcolor(col_limno, 0.1), adjustcolor("red", 0.5)))+
# scale_size_manual(values = c(2.5,4))+
theme_bw()+
# facet_wrap(Genome_ID~GCx)+
theme(axis.text=element_text(size=12.5), axis.title=element_text(size=20),
title=element_text(size=20), legend.text=element_text(size=14),
legend.background = element_rect(fill="transparent"),
axis.text.x = element_text(angle = 60, hjust = 1),
strip.text.x=element_text(size=18),
legend.position = "bottom")+
xlab("")+
ylab(expression(dN/dS))+
ylim(0,20)+
guides(size = FALSE, fill = FALSE)+
geom_label_repel(aes(label = selected_KOlevels_labels), fontface = 'bold', color = 'black',
box.padding = 0.35, point.padding = 0.5,
segment.color = "#3F4656",
size = 3,
# Width of the line segments.
segment.size = 1,
# Draw an arrow from the label to the data point.
arrow = arrow(length = unit(0.02, 'npc'), type = "closed"),
nudge_x = -0.1,
nudge_y = 0.6,
force = 10
)
p_SCUO_posi4anvi-run-ncbi-cogs by blast searches.--use-ncbi-blast flag as recommended by the developers.for gene in `cat gene_list_pan.tsv`; do
anvi-get-dna-sequences-for-gene-calls -c 121950_assembled.db --gene-caller-ids $gene -o genes_tmp.fa
cat genes_tmp.fa >> genes_pan.fa
done
Pangenome visualized in anvio
panG <- read.table("./panG/SUMMARY_Ramli_PCs/panG-ramli_protein_clusters_summary.txt", header = TRUE, fill = TRUE, sep = "\t")[ , c("bin_name", "genome_name", "gene_callers_id", "COG_CATEGORY_ACC", "COG_CATEGORY", "COG_FUNCTION_ACC", "COG_FUNCTION", "aa_sequence")]
panG$aa_sequence <- gsub("-", "", panG$aa_sequence)
panG_MAG <- panG %>% filter(bin_name %in% c("MAG_PC", "Ramli_5-10_PC",
"Ramli_Leaf400_PC", "Ramli_TTB310_PC"))
panG_MAG$gene_callers_id <- as.character(panG_MAG$gene_callers_id)
write.table(paste(unique(panG_MAG$COG_FUNCTION_ACC), "W10", "#e2a2fd", sep=" "), "./panG/cog_id_panG.txt", row.names = FALSE,
quote = FALSE, col.names = FALSE)
write.table(paste(">", panG_MAG$gene_callers_id,
"\n", panG_MAG$aa_sequence, sep = ""),
"./panG/aaSeq_panG.fa", row.names = FALSE,
quote = FALSE, col.names = FALSE)
write.table(unique(panG_ko_cog$gene_callers_id),
"./panG/gene_ids_pan.tsv", row.names = FALSE,
quote = FALSE, col.names = FALSE)## Error in unique(panG_ko_cog$gene_callers_id): object 'panG_ko_cog' not found
# Import KEGG annotation through KAAS (http://www.genome.jp/tools/kaas/) of amino
# acid sequences
ko_files <- list.files(".", pattern = "KO-annotation.tsv",
recursive = TRUE)
panG_ko <- data.frame()
for(ko_file in ko_files){
tmp <- read.delim(ko_file, header = FALSE)
tmp <- data.frame(tmp, Genome = do.call(rbind, strsplit(ko_file, "_"))[,2])
if(ko_file == ko_files[1]) panG_ko <- tmp else{
panG_ko <- rbind(panG_ko, tmp)
}
}
colnames(panG_ko)[1:2] <- c("gene_id", "ko_id")
panG_ko <- panG_ko[panG_ko$ko_id != "",]
panG_ko$ko_id <- gsub(" ","", panG_ko$ko_id)
# Annotate KO_IDs with hierarchy
panG_ko <- dplyr::left_join(panG_ko, ko_path_df, by = c("ko_id" = "KO"))
# join with corresponding COG ids
panG_ko_cog <- dplyr::left_join(panG_MAG, panG_ko, by = c("gene_callers_id" = "gene_id"))
panG_ko_cog$level_B <- substring(panG_ko_cog$level_B, 7)
panG_ko_cog$level_C <- substring(panG_ko_cog$level_C, 7)
panG_ko_cog$level_A <- substring(panG_ko_cog$level_A, 7)
panG_ko_cog$level_C <- gsub("\\[[^\\]]*\\]", "", panG_ko_cog$level_C , perl=TRUE)
# Shorten/change level_B annotation a bit
panG_ko_cog$level_B[panG_ko_cog$level_B == "Cellular community - prokaryotes"] <- "Biofilm formation & quorum sensing"
panG_ko_cog$level_B[panG_ko_cog$level_B == "Xenobiotics biodegradation and metabolism"] <- "Xenobiotics degradation"
panG_ko_cog$level_C[panG_ko_cog$level_C == "Biofilm formation - Escherichia coli "] <- "Biofilm formation"
panG_ko_cog$level_C[panG_ko_cog$level_C == "Biofilm formation - Pseudomonas aeruginosa "] <- "Xenobiotics degradation"
tmp_names <- names(table(panG_ko_cog$level_B)[rev(order(table(panG_ko_cog$level_B)))][1:10])
# Plot distribution of panG annotation
p_panG1 <- panG_ko_cog %>% filter(level_B %in% tmp_names) %>%
ggplot(aes(x = level_A, fill = level_B))+
geom_bar(color = "black")+
theme_bw()+
scale_fill_brewer(palette="Paired")+
ggtitle("Number of genes")+
ylab("") + xlab("")+
facet_grid(genome_name~.)+
theme(axis.text=element_text(size=12.5), axis.title=element_text(18),
title=element_text(size=18), legend.text=element_text(size=14),
legend.background = element_rect(fill="transparent"),
axis.text.x = element_text(angle = 45, hjust = 1),
strip.text=element_text(size=18),
plot.margin = unit(c(1,1,1,1), "cm"), legend.title = element_blank()
# ,legend.position = c(0.87, 0.85)
)
print(p_panG1)# Now select the genes that are PSGs
blast_panG <- read.delim("./panG/genes_pan.blast", header = FALSE)
colnames(blast_panG) <- c("qseqid", "sseqid", "pident", "length", "mismatch",
"gapopen", "qstart", "qend", "sstart", "send",
"evalue", "bitscore")
blast_panG$qseqid <- do.call(rbind, strsplit(as.character(blast_panG$qseqid), split = "|", fixed = TRUE))[,1]pyani# read file
# data.ANI <- read.table("./ANI/ANIb_percentage_identity.tab")
data.ANI <- read.table("./ANI/ANIb_percentage_identity.tab")
# Replace genome names by better annotated names
map_ani <- read.delim("./Mapping_files/pyani_ref_names.tsv", stringsAsFactors = FALSE)
for(i in 1:nrow(map_ani)){
colnames(data.ANI)[colnames(data.ANI) %in% map_ani$ani_file[i]] <- map_ani$ref_name[i]
rownames(data.ANI)[rownames(data.ANI) %in% map_ani$ani_file[i]] <- map_ani$ref_name[i]
}
# Order y-axis according to phylogenetic tree order
ord_list_bin <- c("Lim. sp. Rim11", "Lim. sp. 103DPR2",
"Lim. sp. 2KL-27", "Lim. sp. Rim47",
"Lim. sp. II-D5", "Lim. sp. 2KL-3",
"Rhodo. sp. ED16","Rhodo. sp. T118",
"Curvi. sp. ATCC", "Curvi. sp. PAE-UM",
"Vario. sp. 110B", "Vario. sp. EPS",
"Ramli. sp. Leaf400", "Ramli. sp. TTB310",
"Ramli. sp. MAG",
"Ramli. sp. 5-10"
)
# order rows and columns
data.ANI <- data.ANI[, order(match(colnames(data.ANI), ord_list_bin))]
data.ANI <- data.ANI[order(match(rownames(data.ANI), ord_list_bin)), ]
data.ANI <- get_upper_tri(data.ANI)
# melt to dataframe
df_pyani <- melt(as.matrix(data.ANI), na.rm = TRUE) # reshape into dataframe
names(df_pyani)[c(1:3)] <- c("bin1", "bin2", "ANI")
df_pyani[, 1] <- as.character(df_pyani[, 1]); df_pyani[, 2] <- as.character(df_pyani[, 2])
df_pyani$bin2 <- factor(df_pyani$bin2, levels = ord_list_bin)
df_pyani$bin1 <- factor(df_pyani$bin1, levels = rev(ord_list_bin))
# make plot
p_ani <- ggplot(data = df_pyani,
aes(x = bin2, y = bin1, fill = ANI))+
theme(axis.title=element_text(size=16), strip.text.x=element_text(size=16),
legend.title=element_text(size=15), legend.text=element_text(size=14),
axis.text.y = element_blank(),
axis.text.x = element_text(size=16, angle = 55, hjust = 0),
title=element_text(size=20),
plot.margin = unit(c(1.1,1.1,1.1,1.1), "cm"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_rect(fill = "transparent",colour = NA),
plot.background = element_rect(fill = "transparent",colour = NA)
)+
geom_raster()+
scale_fill_distiller(palette = "RdBu", name = "Average\nNucleotide\nIdentity\n",
limits = c(0.75,1.0), oob=squish) +
geom_text(aes(label = round(ANI, 2)), size = 4.5)+
xlab("")+
ylab("")+
scale_x_discrete(position = "top")
print(p_ani)Recyclerfiles <- list.files("./recycler_plasmid/", pattern="*_length_cov")
files <- paste0("./recycler_plasmid/", files)
df_plasm_16 <- read.table(files[1], header = FALSE)
df_plasm_17 <- read.table(files[2], header = FALSE)
df_plasm_64 <- read.table(files[3], header = FALSE)
df_plasm_65 <- read.table(files[4], header = FALSE)
df_plasm <- rbind(df_plasm_16, df_plasm_17, df_plasm_64, df_plasm_65)
df_plasm <- as.character(df_plasm$V1); tmp <- df_plasm
df_plasm <- do.call(rbind, strsplit(df_plasm, "_"))
df_plasm <- df_plasm[, -c(1,3,5,7,8)]
colnames(df_plasm) <- c("NODE_ID", "Length", "avg_coverage")
df_plasm <- apply(df_plasm, 2, function(x) as.integer(x))
df_plasm <- data.frame(df_plasm, node_IDs = tmp)
# Filter out plasmid scaffolds lower than 1000 bp
p_plasm1 <- df_plasm %>% filter(Length > 1000) %>%
ggplot(aes(x = Length/1000, y = sqrt(avg_coverage), fill = Length))+
geom_point(shape = 21, size = 4)+ theme_bw()+
# scale_x_log10()+
# scale_y_log10()+
scale_fill_continuous()+
ylab(expression(sqrt(Average_coverage)))+
xlab("Length (kbp)")+
ggtitle("Coverage-Length distribution of reconstructed plasmid fragments")
p_plasm1library("GSAR")